player_data <- data.frame() # initialize a data frame to dump the player data into
# get all player data from the 2000 season to the 2019 season
for (year in c(2000:2019)) {
url_player_totals <- paste("https://www.basketball-reference.com/leagues/NBA_",year,"_totals.html", sep = "")
url_player_advanced <- paste("https://www.basketball-reference.com/leagues/NBA_",year,"_advanced.html", sep = "")
# select player totals stats table by id
player_totals <- url_player_totals %>%
read_html() %>%
html_node("#totals_stats") %>%
html_table()
player_totals <- player_totals[colSums(!is.na(player_totals)) > 0] # delete empty columns
# select advanced player stats table by id
player_advanced <- url_player_advanced %>%
read_html() %>%
html_node("#advanced_stats") %>%
html_table()
player_advanced <- player_advanced[colSums(!is.na(player_advanced)) > 0] # delete empty columns
# join the player totals and player advanced tables
season_player_data <- join(player_totals, player_advanced, by = c("Player", "Rk", "Pos", "Age", "Tm", "G", "MP"), match = "first") %>% # join both data sets by Player
add_column(Year = year, .before = 1) %>% # add column for year
within(rm("Rk")) # remove unnecessary column
# append the current season's data to our dataframes for team and player statistics
player_data <- rbind(player_data,season_player_data) # bind this year's player data with the rest of the player data
# remove the unnecessary tables
rm(player_totals)
rm(player_advanced)
rm(season_player_data)
}
team_data <- data.frame() # initialize a data frame to dump the team data into
for (year in c(1:20)) {
url_team <- paste("http://www.nbaminer.com/nbaminer_nbaminer/four_factors.php?operation=eexcel&partitionpage=",year,"&partition2page=", sep = "")
season_colnames <- url_team %>%
read_html() %>%
html_nodes("table>tr:nth-child(1)>td") %>%
html_text()
season_data <- read_html(url_team) %>%
html_node("table") %>%
html_table() %>%
as_data_frame()
season_data <- season_data[-1,] # drop the first row which had the headers
season_data <- season_data %>%
magrittr::set_colnames(season_colnames) %>%
add_column(Year = 2020-year, .before = 1) # add column for year
# only keep cols we care about
season_data <- subset(season_data, select = c("Year","Team","Team EFg%","Team FT Rate","Team TO Rate","Team Off. Reb.%","Win","Loss")) %>%
# rename cols to match basketball statistics terms
rename(c('Team EFg%'='eFG%','Team FT Rate'='FT%','Team TO Rate'='TOV%','Team Off. Reb.%'='ORB%','Win'='W','Loss'='L')) %>%
type_convert(col_types = cols('W' = col_integer(), 'L' = col_integer())) # convert column types to allow division of wins and losses
season_data <- season_data %>%
add_column('W/L%' = season_data$'W' / (season_data$'W' + season_data$'L')) # add col for W/L%
season_data <- season_data %>%
arrange(desc(season_data$'W/L%')) %>%
add_column(Rk = seq(1:nrow(season_data))) # add column to show their rank that season
team_data <- rbind(team_data,season_data) # bind this year's team data with the rest of the team data
}
# remove the unnecessary tables
rm(season_data)
rm(season_colnames)
rm(year)
# remove extra header rows inside table
player_data <- player_data[!player_data$Player=="Player",]
# replace all empty entries with NA
team_data[team_data == ""] <- NA
player_data[player_data == ""] <- NA
player_data[player_data == "TOT"] <- "TOR" # Toronto Raptors mis-abbreviated at TOT
rownames(player_data) <- 1:nrow(player_data) # reset row names by indexes
# some years listed a "*" after a player's name if they were an All-Star, but it was not consistently documented for each year, so it will be removed to avoid confusion
player_data <- player_data %>%
mutate(Player = ifelse(substr(Player, nchar(Player), nchar(Player)) == "*", substr(Player, 1, nchar(Player)-1), Player))
# convert columns for player dataframe to appropriate types
player_data <- player_data %>%
type_convert(col_types = cols(
Year = col_integer(),
Player = col_character(),
Pos = col_character(),
Age = col_integer(),
Tm = col_character(),
G = col_integer(),
GS = col_integer(),
MP = col_integer(),
FG = col_integer(),
FGA = col_integer(),
'FG%' = col_double(),
'3P' = col_integer(),
'3PA' = col_integer(),
'3P%' = col_double(),
'2P' = col_integer(),
'2PA' = col_integer(),
'2P%' = col_double(),
'eFG%' = col_double(),
FT = col_integer(),
FTA = col_integer(),
'FT%' = col_double(),
ORB = col_integer(),
DRB = col_integer(),
TRB = col_integer(),
AST = col_integer(),
STL = col_integer(),
BLK = col_integer(),
TOV = col_integer(),
PF = col_integer(),
PTS = col_integer(),
PER = col_double(),
'TS%' = col_double(),
'3PAr' = col_double(),
FTr = col_double(),
'ORB%' = col_double(),
'DRB%' = col_double(),
'TRB%' = col_double(),
'AST%' = col_double(),
'STL%' = col_double(),
'BLK%' = col_double(),
'TOV%' = col_double(),
'USG%' = col_double(),
OWS = col_double(),
DWS = col_double(),
WS = col_double(),
'WS/48' = col_double(),
OBPM = col_double(),
DBPM = col_double(),
BPM = col_double(),
VORP = col_double()
)
)
# plot of teams' success over the years: single scatter plot of winning percentage over time
team_data %>%
ggplot(aes(x=team_data$Year, y=team_data$'W/L%', color=Team)) +
geom_point() +
geom_smooth(method='lm', se=FALSE) +
ggtitle("Team Winning Percentage Over Time \n Between 1999 and 2019") +
xlim(2000, 2019) +
xlab("Year") +
ylab("Winning Percentage") +
theme(legend.position="bottom", legend.key.size = unit(0.5, "cm"), legend.text = element_text(size=6))
# looking at the slope estimates of each of the team's regression lines
# linear regression model for W/L%, including a term for an interaction between Year and Team
interaction_model <- lm(team_data$'W/L%' ~Year*Team, data = team_data)
# cleaning up the data frame to present each team with the slope of their regression line and the p value
team_success_rates <- data.frame(term = broom::tidy(interaction_model)[,1], estimate = broom::tidy(interaction_model)[,2]) %>%
subset(grepl("Year.*",term))
team_success_rates <- team_success_rates %>%
mutate(term = gsub("Year:Team","",team_success_rates$term))
team_success_rates <- team_success_rates[with(team_success_rates, order(-estimate)),] %>% structure(row.names=seq(1:nrow(team_success_rates)))
team_success_rates <- team_success_rates %>%
mutate(term = gsub("^Year$","Atlanta Hawks",team_success_rates$term)) %>% # set 'Year' term to 'Atlanta Hawks' estimate since it is the default paramter, when the coefficients for all other teams are 0
arrange(desc(estimate))
team_success_rates
# residuals vs. year violin plot for the interaction model
broom::augment(interaction_model) %>%
ggplot(aes(x=factor(Year), y=.resid)) +
geom_violin() +
labs(title="Residuals vs. Year",
x = "year",
y = "residual")
# residuals vs. year violin plot for the interaction model
broom::augment(interaction_model) %>%
ggplot(aes(x=.fitted, y=.resid)) +
geom_point() +
geom_smooth(method = lm) +
labs(title="Residuals vs. Fitted",
x = "fitted values",
y = "residual")
filtered_teams <- team_data %>%
filter(Team=="Golden State Warriors" | Team=="Atlanta Hawks" | Team=="Los Angeles Clippers" | Team=="Vancouver Grizzlies" | Team=="Chicago Bulls" | Team=="Toronto Raptors" | Team=="Houston Rockets" | Team=="San Antonio Spurs" )
filtered_teams %>%
ggplot(aes(x=Year, y=filtered_teams$'W/L%', color=Team)) +
geom_smooth(method=lm, se = FALSE) +
ggtitle("Team Winning Percentage Over Time \n Between 1999 and 2019") +
xlim(2000, 2019) +
xlab("Year") +
ylab("Winning Percentage") +
theme(legend.position="bottom", legend.key.size = unit(0.5, "cm"), legend.text = element_text(size=6))
# add column to track how many times each team was the best team of the season
list_of_top_rank_teams <- ddply(team_data %>% filter(Rk == 1) %>% group_by(Team), "Team", numcolwise(sum)) %>% select(Rk, Team) %>% rename(c('Rk'='Best_Rank'))
team_success_rates <- merge(team_success_rates, list_of_top_rank_teams %>% rename(c('Team'='term')), by="term", all=T) %>%
mutate(Best_Rank = ifelse(is.na(Best_Rank), 0, Best_Rank))
# bar graph of #1 ranks by team
team_success_rates %>% arrange(desc(Best_Rank)) %>%
ggplot(mapping=aes(x=term, y=Best_Rank, color = term)) +
geom_bar(stat="identity") +
ggtitle("Count of Best Team Record by Team") +
ylab("Count of #1 Ranks") +
theme(axis.text.x = element_text(angle=90, vjust=0.5)) +
scale_x_discrete("Team", breaks=c("Golden State Warriors","Chicago Bulls","Cleveland Cavaliers","Dallas Mavericks","San Antonio Spurs","Boston Celtics","Detroit Pistons","Houston Rockets","Indiana Pacers","Los Angeles Lakers","Miami Heat","Milwaukee Bucks","Phoenix Suns","Sacramento Kings")) +
theme(legend.position="bottom", legend.key.size = unit(0.5, "cm"), legend.text = element_text(size=6)) +
scale_color_discrete(name="Team")
ggplot(team_data, aes(x=Year, y=team_data$'W/L%', color=Team)) +
geom_point() +
ggtitle("Win-Loss Perentage of NBA Teams Over Time\n(2000-2019)") +
xlab("Year") +
ylab("W/L%") +
scale_y_continuous(labels = scales::number_format(accuracy = 0.01, decimal.mark = '.')) +
theme(plot.title = element_text(lineheight=.8, face="bold", size = 20)) +
theme(text = element_text(size=18)) +
facet_wrap(~Team, nc=5) + # create faceted panel
theme(legend.position="none") # hide legend for facet plot
Year: Year the NBA season ended
Team: Team
eFG%: Effective Field Goal Percentage; this statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal
FT%: Free Throws Per Field Goal Attempt
TOV: Turnover Percentage; an estimate of turnovers committed per 100 plays
ORB%: Offensive Rebound Percentage; an estimate of the percentage of available offensive rebounds a player grabbed while he was on the floor
W: Wins
L: Losses
W/L%: Win-Loss Percentage
Rk: Rank
Year: Year the NBA season ended
Player: Name of player
Pos: Position
Age: Age; player age on February 1 of the given season
Tm: Team
G: Games
GS: Games Started
MP: Minutes Played
FG: Field Goals (includes both 2-point field goals and 3-point field goals)
FGA: Field Goal Attempts (includes both 2-point field goal attempts and 3-point field goal attempts)
FG%: ield Goal Percentage; the formula is FG / FGA
3P: 3-Point Field Goals
3PA: 3-Point Field Goal Attempts
3P%: 3-Point Field Goal Percentage; the formula is 3P / 3PA
2P: 2-Point Field Goals
2PA: 2-Point Field Goal Attempts
2P%: 2-Point Field Goal Percentage; the formula is 2P / 2PA
eFG%: Effective Field Goal Percentage; the formula is (FG + 0.5 * 3P) / FGA. This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal. For example, suppose Player A goes 4 for 10 with 2 threes, while Player B goes 5 for 10 with 0 threes. Each player would have 10 points from field goals, and thus would have the same effective field goal percentage (50%).
FT: Free Throws
FTA: Free Throw Attempts
FT%: Free Throw Percentage; the formula is FT / FTA
ORB: Offensive Rebounds
DRB: Defensive Rebounds
TRB: Total Rebounds
AST: Assists
STL: Steals
BLK: Blocks
TOV: Turnovers
PF: Personal Fouls
PTS: Points
PER: Player Efficiency Rating; PER is a rating developed by ESPN.com columnist John Hollinger. In John’s words, “The PER sums up all a player’s positive accomplishments, subtracts the negative accomplishments, and returns a per-minute rating of a player’s performance.”
TS%: True Shooting Percentage; the formula is PTS / (2 * TSA). True shooting percentage is a measure of shooting efficiency that takes into account field goals, 3-point field goals, and free throws.
3PAr: 3-Point Attempt Rate; Percentage of FG attempts from 3-Point Range
FTr: Free Throw Attempt Range; Number of FT Attempts Per FG Attempt
ORB%: Offensive Rebound Percentage; the formula is 100 * (ORB * (Tm MP / 5)) / (MP * (Tm ORB + Opp DRB)). Offensive rebound percentage is an estimate of the percentage of available offensive rebounds a player grabbed while he was on the floor.
DRB%: Defensive Rebound Percentage; the formula is 100 * (DRB * (Tm MP / 5)) / (MP * (Tm DRB + Opp ORB)). Defensive rebound percentage is an estimate of the percentage of available defensive rebounds a player grabbed while he was on the floor.
TRB%: Total Rebound Percentage (available since the 1970-71 season in the NBA); the formula is 100 * (TRB * (Tm MP / 5)) / (MP * (Tm TRB + Opp TRB)). Total rebound percentage is an estimate of the percentage of available rebounds a player grabbed while he was on the floor.
AST%: Assist Percentage (available since the 1964-65 season in the NBA); the formula is 100 * AST / (((MP / (Tm MP / 5)) * Tm FG) - FG). Assist percentage is an estimate of the percentage of teammate field goals a player assisted while he was on the floor.
STL%: Steal Percentage (available since the 1973-74 season in the NBA); the formula is 100 * (STL * (Tm MP / 5)) / (MP * Opp Poss). Steal Percentage is an estimate of the percentage of opponent possessions that end with a steal by the player while he was on the floor.
BLK%: Block Percentage (available since the 1973-74 season in the NBA); the formula is 100 * (BLK * (Tm MP / 5)) / (MP * (Opp FGA - Opp 3PA)). Block percentage is an estimate of the percentage of opponent two-point field goal attempts blocked by the player while he was on the floor.
TOV%: Turnover Percentage (available since the 1977-78 season in the NBA); the formula is 100 * TOV / (FGA + 0.44 * FTA + TOV). Turnover percentage is an estimate of turnovers per 100 plays.
USG%: Usage Percentage (available since the 1977-78 season in the NBA); the formula is 100 * ((FGA + 0.44 * FTA + TOV) * (Tm MP / 5)) / (MP * (Tm FGA + 0.44 * Tm FTA + Tm TOV)). Usage percentage is an estimate of the percentage of team plays used by a player while he was on the floor.
OWS: Offensive Win Shares
DWS: Defensive Win Shares
WS: Win Shares; an estimate of the number of wins contributed by a player
WS/48: Win Shares Per 48 Minutes (available since the 1951-52 season in the NBA); an estimate of the number of wins contributed by the player per 48 minutes (league average is approximately 0.100).
OBPM: Offensive Box Plus/Minus; A box score estimate of the offensive points per 100 possessions a player contributed above a league-average player, translated to an average team
DBPM: Defensive Box Plus/Minus; A box score estimate of the defensive points per 100 possessions a player contributed above a league-average player, translated to an average team
BPM: Box Plus/Minus; A box score estimate of the points per 100 possessions a player contributed above a league-average player, translated to an average team
VORP: Value Over Replacement Player; a box score estimate of the points per 100 TEAM possessions that a player contributed above a replacement-level (-2.0) player, translated to an average team and prorated to an 82-game season. Multiply by 2.70 to convert to wins over replacement.
“Basketball Reference.” 2019. Website. https://www.basketball-reference.com.
“NBA Miner.” 2019. Website. http://www.nbaminer.com/four-factors/.
“NBA Stuffer.” 2019. Website. https://www.nbastuffer.com/analytics101/four-factors/.